In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from munging import session
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
In [3]:
## load data
# Features: gzip-compressed, tab-delimited table whose first row is the header;
# "NA" and empty fields are parsed as missing values.
custdata = pd.read_table(
    "data/orange_small_train.data.gz",
    delimiter="\t",
    header=0,
    na_values=["NA", ""],
    compression="gzip",
)
# Labels: entries equal to 1 become "churn", every other value becomes "nochurn".
churn = np.loadtxt("data/orange_small_train_churn.labels.txt")
custdata["Churn"] = np.where(churn == 1, "churn", "nochurn")
custdata.head(3)
Out[3]:
In [4]:
# Show the column labels of the loaded frame, including the "Churn" column added above.
custdata.columns
Out[4]:
In [58]:
# Wrap the customer frame in a munging Session. "Churn" is passed as the second
# argument -- presumably the name of the target column; confirm against
# munging.session.Session.
custsession = session.Session(custdata, 'Churn')
In [6]:
# Ask the session for its feature groupings, then print each group next to its label.
numerical_feats = custsession.find_numerical_features()
categorical_feats = custsession.find_categorical_features()
na_feats = custsession.find_na_features()
for label, feats in [
    ('numerical:', numerical_feats),
    ('categorical:', categorical_feats),
    ('with_missing_values:', na_feats),
]:
    # Single-argument print: same "label value" output as the two-item print statement.
    print("%s %s" % (label, feats))
In [7]:
# Columns the session classified as neither numerical nor categorical.
# Index.difference is the explicit set-difference API; the `-` operator on an
# Index is deprecated for this purpose and is element-wise subtraction in
# newer pandas releases.
custdata.columns.difference(np.union1d(numerical_feats, categorical_feats))
Out[7]:
In [9]:
# Collect the features the session flags as skewed and as noninformative
# (the criteria live in munging.session), then print both lists.
skewed_feats = custsession.find_skewed_features()
noninformative_feats = custsession.find_noninformative_features()
print("%s %s" % ('skewed numerical feats:', skewed_feats))
print("%s %s" % ('noninformative_feats:', noninformative_feats))
In [10]:
# Frequency of each distinct value of Var196 (missing values are excluded by default).
# Series.value_counts replaces the top-level pd.value_counts helper, which is
# deprecated in recent pandas releases.
custdata["Var196"].value_counts()
Out[10]:
In [11]:
# Plot only the skewed features that were not also flagged as noninformative.
feats_to_plot = [feat for feat in skewed_feats if feat not in noninformative_feats]
custsession.plot_feature_density(feats_to_plot, kind="density")
In [12]:
# Plot the "Churn" label against Var153. The chart type is chosen inside
# Session.plot_feature_pair, which is not visible in this notebook.
custsession.plot_feature_pair("Churn", "Var153")
In [13]:
# Plot Var197 against the "Churn" label, passing a 6x6 figsize.
custsession.plot_feature_pair( "Var197", "Churn", figsize=(6, 6))
In [63]:
# Drop the noninformative features from the session and keep the returned object.
# NOTE(review): this presumably mutates the session, so re-running the cell is
# unlikely to be idempotent -- confirm against munging.session.
remover = custsession.remove_features(noninformative_feats)
# Re-query the session to see what is still flagged after the removal.
print(custsession.find_noninformative_features())
print(custsession.find_skewed_features())
In [15]:
# Current feature list, followed by the shapes of the train and validation frames.
print(custsession.get_features())
print(custsession.get_train_data().shape)
print(custsession.get_validation_data().shape)
In [16]:
# Cross-value table of Var229 against the Churn label. The table layout comes
# from Session.get_crossvalue_table (presumably counts per combination -- confirm).
custsession.get_crossvalue_table(["Var229"], ["Churn"])
Out[16]:
In [17]:
# Same cross-value table, now keyed on the Var229/Var197 combination against Churn.
custsession.get_crossvalue_table(["Var229", "Var197"], ["Churn"])
Out[17]:
In [18]:
# Cross-value table of Var219 against the Churn label.
custsession.get_crossvalue_table(["Var219"], ["Churn"])
Out[18]:
In [19]:
# Plot Var21 against Var22; this pair is inspected again in the correlation matrix below.
custsession.plot_feature_pair("Var21", "Var22")
In [56]:
# Plot Var21 against Var160; this pair is inspected again in the correlation matrix below.
custsession.plot_feature_pair("Var21", "Var160")
In [65]:
# Ask the session which features it considers redundant. The criterion lives in
# munging.session and is not visible here; the following cells walk through a
# correlation-based pruning by hand.
custsession.find_redundant_features()
Out[65]:
In [54]:
# Absolute pairwise correlations of the session's data
# (assumes custsession.data is a pandas DataFrame -- confirm against munging.session).
cmatrix = custsession.data.corr().abs()
# Zero the diagonal so a feature's correlation with itself is never picked as
# the strongest pair. `range` behaves the same under Python 2 and Python 3.
for i in range(cmatrix.shape[0]):
    cmatrix.iloc[i, i] = 0
In [55]:
# Absolute correlations among Var21, Var22 and Var160 (the features plotted pairwise earlier).
cmatrix.loc[["Var21", "Var22", "Var160"], ["Var21", "Var22", "Var160"]]
Out[55]:
In [52]:
# Column-wise mean of the absolute correlations, one value per feature. The
# pruning loop uses it to decide which member of a highly correlated pair to drop.
mean_corr = cmatrix.mean(axis = 0)
In [53]:
# Greedy pruning of highly correlated feature pairs.
# Reads `cmatrix` (absolute correlations with a zeroed diagonal) and `mean_corr`
# (per-feature mean correlation), both built in earlier cells.
# NOTE: `cmatrix` is modified in place, so rebuild it before re-running this cell.
removed_feats = []
while True:
    corr_values = np.asarray(cmatrix)
    # nanmax: an undefined (NaN) correlation would otherwise turn the maximum
    # into NaN, which never satisfies the stop test and matches no cell.
    max_corr = np.nanmax(corr_values)
    # Stop once no remaining pair is correlated above 0.90.
    if max_corr <= 0.90:
        break
    # The matrix is symmetric and distinct pairs can tie at the maximum, so
    # several cells may match; take only the first (row, column) position.
    rows, cols = np.where(corr_values == max_corr)
    f1, f2 = cmatrix.index[rows[0]], cmatrix.columns[cols[0]]
    print("%s %s" % (f1, f2))
    # Drop whichever member of the pair has the higher mean correlation.
    feat_to_remove = f1 if mean_corr[f1] > mean_corr[f2] else f2
    removed_feats.append(feat_to_remove)
    # Zero its row and column so it cannot be selected again.
    cmatrix.loc[:, feat_to_remove] = 0
    cmatrix.loc[feat_to_remove, :] = 0
In [50]:
# Features the pruning loop chose to drop.
removed_feats
Out[50]:
In [29]:
# Debug check: (row, column) positions that hold the current maximum of cmatrix.
np.where(cmatrix == np.asarray(cmatrix).max())
Out[29]:
In [32]:
# Debug check: mean of the cmatrix columns at positions 3 and 4.
# NOTE(review): the positions are hard-coded; which features they refer to
# depends on the current column order.
cmatrix.iloc[:, [3, 4]].mean()
Out[32]:
In [39]:
# Debug check: the single cmatrix entry for the Var6/Var7 pair.
cmatrix.loc["Var6", "Var7"]
Out[39]:
In [ ]: